1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

# Analysis and plotting libraries
library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
library(stringr)  # str_detect()/str_remove_all() are called below but stringr was never attached

# Save the default graphics parameters so plots can be reset with par(op)
op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

1.2 Data: The COVID_19 Data-Set

The data to process is described in:

https://zenodo.org/record/4156647#.Y1bSF3bMKUk

IR Saliva Testing Dataset

10.5281/zenodo.4156647 https://doi.org/10.5281/zenodo.4156647

I added a column to the data identifying the repeated experiments.


SalivaIR <- as.data.frame(read_excel("~/GitHub/FCA/Data/SalivaThermal_Source_Data_2.xlsx"))

# Extract one repeated experiment: index rows by sample ID and drop the
# bookkeeping columns (RepID, ID, Ct) that are not spectral features.
getRepSet <- function(repid, data)
{
  repset <- subset(data, RepID == repid)
  rownames(repset) <- repset$ID
  repset$RepID <- NULL
  repset$ID <- NULL
  repset$Ct <- NULL
  return(repset)
}

SalivaIR_set1 <- getRepSet(1, SalivaIR)
SalivaIR_set2 <- getRepSet(2, SalivaIR)
SalivaIR_set3 <- getRepSet(3, SalivaIR)

# Element-wise average of the three repeated experiments
SalivaIR_Avg <- (SalivaIR_set1 + SalivaIR_set2 + SalivaIR_set3)/3

# Prefix the (numeric) column names with "V_" so they are valid R names
colnames(SalivaIR_Avg) <- paste("V",colnames(SalivaIR_Avg),sep="_")

# Class label: 1 when the sample ID contains "P", 0 otherwise.
# NOTE(review): requires stringr::str_detect to be available.
SalivaIR_Avg$class <- 1*(str_detect(rownames(SalivaIR_Avg),"P"))

pander::pander(table(SalivaIR_Avg$class))
0 1
30 31

1.2.0.1 Standardize the names for the reporting

# Standardized names and parameters used by the reporting code below
studyName <- "IRSaliva"        # report identifier
dataframe <- SalivaIR_Avg      # working data set
outcome <- "class"             # name of the outcome column

TopVariables <- 10             # number of top-ranked features to report

thro <- 0.80                   # IDeA decorrelation threshold
cexheat <- 0.15                # label size for the heat maps (was assigned with "=")

1.3 Generating the report

1.3.1 Libraries

Some libraries

# Additional modeling/plotting libraries used by the report
library(psych)      # fa(): exploratory factor analysis
library(whitening)
library(vioplot)
library(rpart)      # decision trees

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
61 251
pander::pander(table(dataframe[,outcome]))
0 1
30 31

# Candidate feature list: every column except the outcome
varlist <- colnames(dataframe)
varlist <- varlist[!(varlist %in% outcome)]

# Very wide data sets (> 1500 features) skip the expensive plots below
largeSet <- (length(varlist) > 1500)

1.3.3 Scaling the data

Scaling and removing near-zero-variance columns and highly co-linear (r > 0.99999) columns


  ### Some global cleaning
  # Keep only columns whose standard deviation is effectively non-zero.
  # NOTE(review): the name is inverted -- sdiszero is TRUE when sd is NOT zero.
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  # Remove near-duplicate features (|r| > 0.99999); the outcome column is always kept
  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  # Refresh the feature list after the column removals above
  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]
  
  iscontinous <- sapply(apply(dataframe,2,unique),length) > 5 ## Only variables with enough samples



# Rank-based ("OrderLogit") scaling of the cleaned data (FRESA.CAD::FRESAScale)
dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data

# Show at most 1000 samples in the heat maps
numsub <- min(nrow(dataframe), 1000)

if (!largeSet)
{
  # Row-clustered heat map of the scaled data (FRESA.CAD::heatMaps)
  hm <- heatMaps(data=dataframeScaled[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat)
  par(op)  # restore the default graphics parameters
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  # Pairwise Pearson correlation of the raw features
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  # Largest off-diagonal |correlation| of the raw data
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.999994

1.5 The decorrelation


# IDeA decorrelation of the feature matrix (correlation threshold thro = 0.80)
DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  Included: 224 , Uni p: 0.04988877 , Uncorrelated Base: 1 , Outcome-Driven Size: 0 , Base Size: 1 
#> 
#> 
 1 <R=1.000,r=0.975,N=  224>, Top: 2( 53 )[ 1 : 2 Fa= 2 : 0.975 ]( 2 , 147 , 0 ),<|>Tot Used: 149 , Added: 147 , Zero Std: 0 , Max Cor: 1.000
#> 
 2 <R=1.000,r=0.975,N=  224>, Top: 9( 74 )[ 1 : 9 Fa= 11 : 0.975 ]( 9 , 121 , 2 ),<|>Tot Used: 224 , Added: 121 , Zero Std: 0 , Max Cor: 1.000
#> 
 3 <R=1.000,r=0.975,N=  224>, Top: 27( 9 )[ 1 : 27 Fa= 38 : 0.975 ]( 27 , 100 , 11 ),<|>Tot Used: 224 , Added: 100 , Zero Std: 0 , Max Cor: 1.000
#> 
 4 <R=1.000,r=0.975,N=  224>, Top: 46( 5 )=[ 2 : 46 Fa= 83 : 0.989 ]( 45 , 90 , 38 ),<|>Tot Used: 224 , Added: 90 , Zero Std: 0 , Max Cor: 1.000
#> 
 5 <R=1.000,r=0.975,N=  224>, Top: 23( 2 )[ 1 : 23 Fa= 106 : 0.975 ]( 23 , 36 , 83 ),<|>Tot Used: 224 , Added: 36 , Zero Std: 0 , Max Cor: 0.999
#> 
 6 <R=0.999,r=0.974,N=  224>, Top: 11( 1 )[ 1 : 11 Fa= 116 : 0.974 ]( 11 , 12 , 106 ),<|>Tot Used: 224 , Added: 12 , Zero Std: 0 , Max Cor: 0.996
#> 
 7 <R=0.996,r=0.948,N=  118>, Top: 51( 1 )[ 1 : 51 Fa= 125 : 0.948 ]( 50 , 57 , 116 ),<|>Tot Used: 224 , Added: 57 , Zero Std: 0 , Max Cor: 0.998
#> 
 8 <R=0.998,r=0.949,N=  118>, Top: 30( 1 )[ 1 : 30 Fa= 132 : 0.949 ]( 30 , 33 , 125 ),<|>Tot Used: 224 , Added: 33 , Zero Std: 0 , Max Cor: 0.992
#> 
 9 <R=0.992,r=0.946,N=  118>, Top: 19( 1 )[ 1 : 19 Fa= 134 : 0.946 ]( 18 , 21 , 132 ),<|>Tot Used: 224 , Added: 21 , Zero Std: 0 , Max Cor: 0.991
#> 
 10 <R=0.991,r=0.945,N=  118>, Top: 3( 1 )[ 1 : 3 Fa= 134 : 0.945 ]( 3 , 3 , 134 ),<|>Tot Used: 224 , Added: 3 , Zero Std: 0 , Max Cor: 0.966
#> 
 11 <R=0.966,r=0.883,N=  106>, Top: 42( 1 )[ 1 : 42 Fa= 141 : 0.883 ]( 41 , 56 , 134 ),<|>Tot Used: 224 , Added: 56 , Zero Std: 0 , Max Cor: 0.972
#> 
 12 <R=0.972,r=0.886,N=  106>, Top: 13( 1 )[ 1 : 13 Fa= 143 : 0.886 ]( 12 , 13 , 141 ),<|>Tot Used: 224 , Added: 13 , Zero Std: 0 , Max Cor: 0.925
#> 
 13 <R=0.925,r=0.863,N=  106>, Top: 24( 1 )[ 1 : 24 Fa= 145 : 0.863 ]( 23 , 26 , 143 ),<|>Tot Used: 224 , Added: 26 , Zero Std: 0 , Max Cor: 0.992
#> 
 14 <R=0.992,r=0.896,N=  106>, Top: 4( 1 )[ 1 : 4 Fa= 145 : 0.896 ]( 4 , 4 , 145 ),<|>Tot Used: 224 , Added: 4 , Zero Std: 0 , Max Cor: 0.976
#> 
 15 <R=0.976,r=0.838,N=   44>, Top: 21( 1 )[ 1 : 21 Fa= 145 : 0.838 ]( 21 , 23 , 145 ),<|>Tot Used: 224 , Added: 23 , Zero Std: 0 , Max Cor: 0.974
#> 
 16 <R=0.974,r=0.837,N=   44>, Top: 7( 1 )[ 1 : 7 Fa= 146 : 0.837 ]( 7 , 7 , 145 ),<|>Tot Used: 224 , Added: 7 , Zero Std: 0 , Max Cor: 0.894
#> 
 17 <R=0.894,r=0.800,N=   44>, Top: 29( 2 )[ 1 : 29 Fa= 147 : 0.800 ]( 26 , 32 , 146 ),<|>Tot Used: 224 , Added: 32 , Zero Std: 0 , Max Cor: 0.973
#> 
 18 <R=0.973,r=0.837,N=   44>, Top: 5( 1 )[ 1 : 5 Fa= 149 : 0.837 ]( 5 , 6 , 147 ),<|>Tot Used: 224 , Added: 6 , Zero Std: 0 , Max Cor: 0.955
#> 
 19 <R=0.955,r=0.827,N=   44>, Top: 1( 1 )[ 1 : 1 Fa= 149 : 0.827 ]( 1 , 1 , 149 ),<|>Tot Used: 224 , Added: 1 , Zero Std: 0 , Max Cor: 0.825
#> 
 20 <R=0.825,r=0.800,N=    7>, Top: 3( 1 )[ 1 : 3 Fa= 150 : 0.800 ]( 3 , 4 , 149 ),<|>Tot Used: 224 , Added: 4 , Zero Std: 0 , Max Cor: 0.881
#> 
 21 <R=0.881,r=0.800,N=    7>, Top: 2( 1 )[ 1 : 2 Fa= 150 : 0.800 ]( 2 , 2 , 150 ),<|>Tot Used: 224 , Added: 2 , Zero Std: 0 , Max Cor: 0.795
#> 
 22 <R=0.795,r=0.800,N=    0>
#> 
 [ 22 ], 0.7947222 Decor Dimension: 224 Nused: 224 . Cor to Base: 56 , ABase: 1 , Outcome Base: 0 
#> 
# Feature names after decorrelation (altered columns carry a "La_" prefix)
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

# Total variance of the raw features (compare with the decorrelated total below)
pander::pander(sum(apply(dataframe[,varlist],2,var)))

5.5

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

0.0295

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

5.08

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

0.643

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  # The feature transformation matrix returned by IDeA (attribute "UPSTM")
  UPSTM <- attr(DEdataframe,"UPSTM")
  
  # Plot the support (non-zero coefficient pattern) of the transformation
  gplots::heatmap.2(1.0*(abs(UPSTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
}

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  # Row-clustered heat map of the decorrelated data
  hm <- heatMaps(data=DEdataframe[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  # Pairwise Pearson correlation of the decorrelated features
  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after IDeA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  # Largest off-diagonal |correlation| remaining after decorrelation
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.7947222

1.8 U-MAP Visualization of features

1.8.1 The UMAP based on LASSO on Raw Data


if (nrow(dataframe) < 1000)
{
  # One color per outcome class
  classes <- unique(dataframe[1:numsub,outcome])
  raincolors <- rainbow(length(classes))
  names(raincolors) <- classes
  # 2-D UMAP embedding of the scaled raw features
  datasetframe.umap <- umap(scale(dataframe[1:numsub,varlist]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
  # NOTE(review): colors are selected by position (outcome + 1), not by the
  # names assigned above -- assumes unique() returned the classes in 0,1 order; verify.
  text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])
}

1.8.2 The decorrelated UMAP

if (nrow(dataframe) < 1000)
{
  # 2-D UMAP embedding of the scaled decorrelated (IDeA) features;
  # raincolors was assigned in the raw-data UMAP chunk above
  datasetframe.umap <- umap(scale(DEdataframe[1:numsub,varlistc]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
  text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])
}

1.9 Univariate Analysis

1.9.1 Univariate



# Univariate AUC ranking of the raw features against the null model outcome ~ 1
univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")

100 : V_1064 200 : V_854




# Univariate AUC ranking of the decorrelated features (same null model).
# The stray trailing comma of the original call passed an empty argument.
univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC")

100 : La_V_1064 200 : La_V_854

1.9.2 Final Table


# Columns of the univariate report tables
univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

## Logical mask selecting the TopVariables highest-ranked features
## (seq_along avoids the 1:length() zero-length footgun)
topvar <- seq_along(varlist) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
V_908 0.221 0.128 0.261 0.117 0.579 0.596
V_906 0.220 0.127 0.261 0.117 0.585 0.596
V_904 0.220 0.127 0.261 0.117 0.592 0.596
V_892 0.219 0.127 0.261 0.121 0.626 0.596
V_890 0.219 0.127 0.261 0.121 0.616 0.596
V_888 0.219 0.127 0.261 0.122 0.603 0.596
V_912 0.223 0.129 0.263 0.117 0.604 0.595
V_910 0.222 0.128 0.262 0.117 0.587 0.595
V_896 0.220 0.127 0.261 0.120 0.620 0.595
V_894 0.219 0.127 0.261 0.121 0.625 0.595


# Decorrelated ("La_") features among the ranked names
topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
# Union of the overall top features and the best TopVariables/2 latent ones.
# NOTE(review): if fewer than TopVariables/2 "La_" features exist this indexing
# produces NA entries -- confirm the upstream ranking guarantees enough of them.
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]

# Rows of the table that correspond to latent (decorrelated) variables
theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]

pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
La_V_984 -1.77e-04 5.79e-04 5.00e-04 6.60e-04 0.0433 0.810
La_V_926 3.01e-06 1.00e-05 1.32e-05 1.06e-05 0.8339 0.791
La_V_1204 -1.17e-06 1.26e-05 -1.46e-05 1.21e-05 0.8554 0.789
La_V_888 -4.82e-05 4.51e-04 2.98e-04 2.26e-04 0.5083 0.782
La_V_1110 2.65e-05 8.94e-05 -5.42e-05 9.75e-05 0.0657 0.778
La_V_924 -1.77e-05 6.72e-04 -4.71e-04 6.34e-04 0.1659 0.778
La_V_1214 5.07e-04 1.13e-03 1.42e-03 8.02e-04 0.7269 0.762
La_V_964 1.27e-03 2.00e-03 -5.81e-04 2.04e-03 0.3591 0.759
La_V_1172 3.52e-06 3.58e-04 1.13e-04 1.78e-04 0.2284 0.742
La_V_1096 -2.02e-03 1.59e-02 -9.15e-03 1.07e-02 0.1704 0.733

# Linear-combination (latent-variable) coefficients produced by IDeA
dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")

# Raw features participating in the significant latent variables.
# Clearing the list names first makes unlist() return the inner
# (coefficient) names without a "La_...." prefix.
theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))


# Coefficient vectors for the reported rows; deFromula will hold their
# printable formula strings (note: variable name is a long-standing typo)
theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

# Average size, count and fraction of latent-variable formulas
pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
4.83 223 0.996


# Names of every latent variable returned by getLatentCoefficients
allSigvars <- names(dc)



# Build a printable formula string for each reported latent variable.
# Positive coefficients are prefixed with "+ "; a term whose coefficient name
# equals the latent variable's own name (cf == dx) is skipped.
# (The original leftover debug line `dx <- names(deFromula)[1]` was removed:
# the for loop reassigns dx immediately.)
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  cname <- names(theFormulas[[dx]])
  names(cname) <- cname
  for (cf in names(coef))
  {
    if (cf != dx)
    {
      if (coef[cf]>0)
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
      }
      else
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("%5.3f*%s",coef[cf],cname[cf]))
      }
    }
  }
}

# Append the univariate stats of the raw features that feed the latent variables
finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])


# Raw-data AUC per row: strip the "La_" prefix to recover the source feature name
orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

# Report sorted by decorrelated AUC, best first
finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
La_V_984 + 0.025V_1138 -0.040V_1032 + 1.000V_984 -0.987V_982 -1.77e-04 5.79e-04 5.00e-04 6.60e-04 0.0433 0.810 0.582 -3
La_V_926 + 0.138V_930 -0.604V_928 + 1.000V_926 -0.752V_924 + 0.217V_922 -0.000V_878 3.01e-06 1.00e-05 1.32e-05 1.06e-05 0.8339 0.791 0.589 -5
La_V_1204 -0.000V_1300 -0.262V_1206 + 1.000V_1204 -1.415V_1202 + 0.882V_1200 -0.205V_1198 -1.17e-06 1.26e-05 -1.46e-05 1.21e-05 0.8554 0.789 0.560 -5
La_V_888 + 1.000V_888 -1.657V_884 + 0.655*V_878 -4.82e-05 4.51e-04 2.98e-04 2.26e-04 0.5083 0.782 0.596 1
La_V_1110 -0.001V_1138 + 1.000V_1110 -1.754V_1108 + 0.806V_1104 + 1.084V_1096 -1.962V_1094 + 0.826*V_1092 2.65e-05 8.94e-05 -5.42e-05 9.75e-05 0.0657 0.778 0.561 -5
La_V_924 + 1.000V_924 -0.999V_922 -0.005*V_878 -1.77e-05 6.72e-04 -4.71e-04 6.34e-04 0.1659 0.778 0.588 1
La_V_1214 + 0.040V_1300 -1.033V_1216 + 1.000*V_1214 5.07e-04 1.13e-03 1.42e-03 8.02e-04 0.7269 0.762 0.556 2
La_V_964 + 0.069V_1138 -1.035V_972 + 1.000V_964 -6.837V_878 + 6.813*V_876 1.27e-03 2.00e-03 -5.81e-04 2.04e-03 0.3591 0.759 0.585 0
La_V_1172 + 0.000V_1300 + 0.970V_1176 -1.971V_1174 + 1.000V_1172 3.52e-06 3.58e-04 1.13e-04 1.78e-04 0.2284 0.742 0.559 -1
La_V_1096 + 11.525V_1138 -12.543V_1136 + 1.000*V_1096 -2.02e-03 1.59e-02 -9.15e-03 1.07e-02 0.1704 0.733 0.559 13
V_888 NA 2.19e-01 1.27e-01 2.61e-01 1.22e-01 0.6032 0.596 0.596 NA
V_884 NA 2.20e-01 1.28e-01 2.61e-01 1.22e-01 0.6055 0.594 0.594 NA
V_876 NA 2.21e-01 1.30e-01 2.63e-01 1.25e-01 0.6117 0.592 0.592 NA
V_878 NA 2.21e-01 1.29e-01 2.62e-01 1.24e-01 0.6099 0.591 0.591 NA
V_926 NA 2.31e-01 1.35e-01 2.70e-01 1.21e-01 0.6041 0.589 0.589 NA
V_928 NA 2.31e-01 1.35e-01 2.69e-01 1.21e-01 0.6106 0.588 0.588 NA
V_924 NA 2.31e-01 1.35e-01 2.69e-01 1.20e-01 0.5986 0.588 0.588 NA
V_922 NA 2.30e-01 1.34e-01 2.69e-01 1.20e-01 0.5973 0.588 0.588 NA
V_930 NA 2.31e-01 1.35e-01 2.69e-01 1.21e-01 0.6155 0.585 0.585 NA
V_972 NA 2.52e-01 1.45e-01 2.97e-01 1.37e-01 0.8095 0.585 0.585 NA
V_964 NA 2.43e-01 1.40e-01 2.86e-01 1.29e-01 0.7058 0.585 0.585 NA
V_982 NA 2.61e-01 1.51e-01 3.09e-01 1.46e-01 0.7187 0.584 0.584 NA
V_984 NA 2.62e-01 1.52e-01 3.11e-01 1.47e-01 0.6947 0.582 0.582 NA
V_1198 NA 2.65e-01 1.52e-01 2.88e-01 1.24e-01 0.8432 0.567 0.567 NA
V_1200 NA 2.67e-01 1.53e-01 2.89e-01 1.25e-01 0.8221 0.566 0.566 NA
V_1176 NA 2.78e-01 1.58e-01 2.97e-01 1.28e-01 0.8323 0.565 0.565 NA
V_1138 NA 3.16e-01 1.84e-01 3.48e-01 1.61e-01 0.6581 0.561 0.561 NA
V_1202 NA 2.68e-01 1.54e-01 2.90e-01 1.26e-01 0.8109 0.561 0.561 NA
V_1110 NA 3.43e-01 2.03e-01 3.81e-01 1.91e-01 0.4401 0.561 0.561 NA
V_1108 NA 3.44e-01 2.04e-01 3.82e-01 1.92e-01 0.4129 0.561 0.561 NA
V_1104 NA 3.46e-01 2.06e-01 3.85e-01 1.95e-01 0.3736 0.561 0.561 NA
V_1136 NA 3.19e-01 1.86e-01 3.51e-01 1.64e-01 0.6344 0.561 0.561 NA
V_1204 NA 2.69e-01 1.56e-01 2.92e-01 1.27e-01 0.7976 0.560 0.560 NA
V_1174 NA 2.81e-01 1.61e-01 3.00e-01 1.30e-01 0.8251 0.560 0.560 NA
V_1096 NA 3.52e-01 2.12e-01 3.90e-01 2.01e-01 0.3400 0.559 0.559 NA
V_1172 NA 2.85e-01 1.63e-01 3.03e-01 1.32e-01 0.8192 0.559 0.559 NA
V_1206 NA 2.71e-01 1.57e-01 2.93e-01 1.28e-01 0.7856 0.558 0.558 NA
V_1094 NA 3.54e-01 2.14e-01 3.92e-01 2.02e-01 0.3342 0.557 0.557 NA
V_1092 NA 3.56e-01 2.16e-01 3.93e-01 2.03e-01 0.3287 0.557 0.557 NA
V_1214 NA 2.77e-01 1.62e-01 2.96e-01 1.32e-01 0.7482 0.556 0.556 NA
V_1216 NA 2.78e-01 1.64e-01 2.97e-01 1.33e-01 0.7029 0.555 0.555 NA
V_1032 NA 3.20e-01 1.99e-01 3.41e-01 1.70e-01 0.6889 0.548 0.548 NA
V_1300 NA 2.88e-01 1.75e-01 3.00e-01 1.37e-01 0.5379 0.546 0.546 55

1.10 Comparing IDeA vs PCA vs EFA

1.10.1 PCA

featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]  # NOTE(review): appears unused below
pc <- prcomp(dataframe[,iscontinous],center = TRUE,tol=0.002)   #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
# Recombine the PCA scores with the remaining (non-continuous) columns, outcome included
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous]) 
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)

#pander::pander(pc$rotation)


# Pairwise correlation between the PCA features
PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])


  gplots::heatmap.2(abs(PCACor),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "PCA Correlation",
                    cexRow = 0.5,
                    cexCol = 0.5,
                     srtCol=45,
                     srtRow= -45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")

1.10.2 EFA


# EFA features default to the scaled data when the factor analysis is skipped
EFAdataframe <- dataframeScaled

# NOTE(review): length(iscontinous) is the TOTAL number of columns, not the
# count of continuous ones -- sum(iscontinous) may have been intended; verify.
if (length(iscontinous) < 2000)
{
  # Number of factors: bounded by columns, rows, and half the PCA dimension
  topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)/2)
  if (topred < 2) topred <- 2
  
  uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE)  # EFA analysis
  predEFA <- predict(uls,dataframeScaled[,iscontinous])
  # Recombine the factor scores with the non-continuous columns, outcome included
  EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
  colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous]) 


  
  # Pairwise correlation between the EFA features
  EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
  
  
    gplots::heatmap.2(abs(EFACor),
                      trace = "none",
    #                  scale = "row",
                      mar = c(5,5),
                      col=rev(heat.colors(5)),
                      main = "EFA Correlation",
                      cexRow = 0.5,
                      cexCol = 0.5,
                       srtCol=45,
                       srtRow= -45,
                      key.title=NA,
                      key.xlab="Pearson Correlation",
                      xlab="Feature", ylab="Feature")
}

1.11 Effect on CAR modeling

par(op)
par(xpd = TRUE)
# Outcome as a factor so rpart fits a classification tree
dataframe[,outcome] <- factor(dataframe[,outcome])
# Depth-limited (maxdepth = 3) decision tree on the raw features
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")

  # Placeholder in case the tree predicts a single class (no plot/stats possible)
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(rawmodel, use.n = TRUE,cex=0.75)
    # Diagnostic-test statistics from the 2x2 confusion table
    ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
  }

# Confusion matrix: observed outcome vs tree prediction
pander::pander(table(dataframe[,outcome],pr))
  0 1
0 30 0
1 17 14
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.230 0.132 0.355
    tp 0.508 0.377 0.639
    se 0.452 0.273 0.640
    sp 1.000 0.884 1.000
    diag.ac 0.721 0.592 0.829
    diag.or Inf NA Inf
    nndx 2.214 1.563 6.351
    youden 0.452 0.157 0.640
    pv.pos 1.000 0.768 1.000
    pv.neg 0.638 0.485 0.773
    lr.pos Inf NA Inf
    lr.neg 0.548 0.398 0.755
    p.rout 0.770 0.645 0.868
    p.rin 0.230 0.132 0.355
    p.tpdn 0.000 0.000 0.116
    p.tndp 0.548 0.360 0.727
    p.dntp 0.000 0.000 0.232
    p.dptn 0.362 0.227 0.515
  • tab:

      Outcome + Outcome - Total
    Test + 14 0 14
    Test - 17 30 47
    Total 31 30 61
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.721 0.592 0.829
3 se 0.452 0.273 0.640
4 sp 1.000 0.884 1.000
6 diag.or Inf NA Inf

par(op)
par(xpd = TRUE)
# Same depth-3 tree evaluation, now on the IDeA-decorrelated features
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe,control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")

  # Placeholder in case the tree predicts a single class
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(IDeAmodel,main="IDeA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(IDeAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
  }

pander::pander(table(DEdataframe[,outcome],pr))
  0 1
0 29 1
1 5 26
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.4426 3.15e-01 0.576
    tp 0.5082 3.77e-01 0.639
    se 0.8387 6.63e-01 0.945
    sp 0.9667 8.28e-01 0.999
    diag.ac 0.9016 7.98e-01 0.963
    diag.or 150.8000 1.65e+01 1376.474
    nndx 1.2417 1.06e+00 2.038
    youden 0.8054 4.91e-01 0.945
    pv.pos 0.9630 8.10e-01 0.999
    pv.neg 0.8529 6.89e-01 0.950
    lr.pos 25.1613 3.64e+00 173.904
    lr.neg 0.1669 7.46e-02 0.373
    p.rout 0.5574 4.24e-01 0.685
    p.rin 0.4426 3.15e-01 0.576
    p.tpdn 0.0333 8.44e-04 0.172
    p.tndp 0.1613 5.45e-02 0.337
    p.dntp 0.0370 9.37e-04 0.190
    p.dptn 0.1471 4.95e-02 0.311
  • tab:

      Outcome + Outcome - Total
    Test + 26 1 27
    Test - 5 29 34
    Total 31 30 61
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.902 0.798 0.963
3 se 0.839 0.663 0.945
4 sp 0.967 0.828 0.999
6 diag.or 150.800 16.521 1376.474

par(op)
par(xpd = TRUE)
# Same depth-3 tree evaluation, now on the PCA features
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
# Placeholder in case the tree predicts a single class
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
  plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
  text(PCAmodel, use.n = TRUE,cex=0.75)
  ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}

pander::pander(table(PCAdataframe[,outcome],pr))
  0 1
0 23 7
1 2 29
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.5902 0.45678 0.714
    tp 0.5082 0.37697 0.639
    se 0.9355 0.78578 0.992
    sp 0.7667 0.57716 0.901
    diag.ac 0.8525 0.73831 0.930
    diag.or 47.6429 9.02234 251.580
    nndx 1.4242 1.12013 2.755
    youden 0.7022 0.36295 0.893
    pv.pos 0.8056 0.63975 0.918
    pv.neg 0.9200 0.73969 0.990
    lr.pos 4.0092 2.08215 7.720
    lr.neg 0.0842 0.02171 0.326
    p.rout 0.4098 0.28550 0.543
    p.rin 0.5902 0.45678 0.714
    p.tpdn 0.2333 0.09934 0.423
    p.tndp 0.0645 0.00791 0.214
    p.dntp 0.1944 0.08194 0.360
    p.dptn 0.0800 0.00984 0.260
  • tab:

      Outcome + Outcome - Total
    Test + 29 7 36
    Test - 2 23 25
    Total 31 30 61
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.852 0.738 0.930
3 se 0.935 0.786 0.992
4 sp 0.767 0.577 0.901
6 diag.or 47.643 9.022 251.580


par(op)

1.11.1 EFA


  # Same depth-3 tree evaluation, now on the EFA features
  EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
  EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
  pr <- predict(EFAmodel,EFAdataframe,type = "class")
  
  # Placeholder in case the tree predicts a single class
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(EFAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
  }


  pander::pander(table(EFAdataframe[,outcome],pr))
  0 1
0 29 1
1 17 14
  pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.2459 0.144643 0.373
    tp 0.5082 0.376968 0.639
    se 0.4516 0.273165 0.640
    sp 0.9667 0.827831 0.999
    diag.ac 0.7049 0.574305 0.815
    diag.or 23.8824 2.880288 198.024
    nndx 2.3907 1.565401 9.901
    youden 0.4183 0.100996 0.639
    pv.pos 0.9333 0.680515 0.998
    pv.neg 0.6304 0.475485 0.768
    lr.pos 13.5484 1.897603 96.732
    lr.neg 0.5673 0.409359 0.786
    p.rout 0.7541 0.627059 0.855
    p.rin 0.2459 0.144643 0.373
    p.tpdn 0.0333 0.000844 0.172
    p.tndp 0.5484 0.360342 0.727
    p.dntp 0.0667 0.001686 0.319
    p.dptn 0.3696 0.232066 0.525
  • tab:

      Outcome + Outcome - Total
    Test + 14 1 15
    Test - 17 29 46
    Total 31 30 61
  • method: exact

  • digits: 2

  • conf.level: 0.95

  pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.705 0.574 0.815
3 se 0.452 0.273 0.640
4 sp 0.967 0.828 0.999
6 diag.or 23.882 2.880 198.024
  par(op)